// Copyright (c) 2020-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.

/* -*- mode: C; tab-width: 2; indent-tabs-mode: nil; -*- */

/*
 * This code has been contributed by the DARPA HPCS program.  Contact
 * David Koester <dkoester@mitre.org> or Bob Lucas <rflucas@isi.edu>
 * if you have questions.
 *
 *
 * GUPS (Giga UPdates per Second) is a measurement that profiles the memory
 * architecture of a system and is a measure of performance similar to MFLOPS.
 * The HPCS HPCchallenge RandomAccess benchmark is intended to exercise the
 * GUPS capability of a system, much like the LINPACK benchmark is intended to
 * exercise the MFLOPS capability of a computer.  In each case, we would
 * expect these benchmarks to achieve close to the "peak" capability of the
 * memory system. The extent of the similarities between RandomAccess and
 * LINPACK are limited to both benchmarks attempting to calculate a peak system
 * capability.
 *
 * GUPS is calculated by identifying the number of memory locations that can be
 * randomly updated in one second, divided by 1 billion (1e9). The term "randomly"
 * means that there is little relationship between one address to be updated and
 * the next, except that they occur in the space of one half the total system
 * memory.  An update is a read-modify-write operation on a table of 64-bit words.
 * An address is generated, the value at that address read from memory, modified
 * by an integer operation (add, and, or, xor) with a literal value, and that
 * new value is written back to memory.
 *
 * We are interested in knowing the GUPS performance of both entire systems and
 * system subcomponents --- e.g., the GUPS rating of a distributed memory
 * multiprocessor the GUPS rating of an SMP node, and the GUPS rating of a
 * single processor.  While there is typically a scaling of FLOPS with processor
 * count, a similar phenomenon may not always occur for GUPS.
 *
 * Select the memory size to be the power of two such that 2^n <= 1/2 of the
 * total memory.  Each CPU operates on its own address stream, and the single
 * table may be distributed among nodes. The distribution of memory to nodes
 * is left to the implementer.  A uniform data distribution may help balance
 * the workload, while non-uniform data distributions may simplify the
 * calculations that identify processor location by eliminating the requirement
 * for integer divides. A small (less than 1%) percentage of missed updates
 * are permitted.
 *
 * When implementing a benchmark that measures GUPS on a distributed memory
 * multiprocessor system, it may be required to define constraints as to how
 * far in the random address stream each node is permitted to "look ahead".
 * Likewise, it may be required to define a constraint as to the number of
 * update messages that can be stored before processing to permit multi-level
 * parallelism for those systems that support such a paradigm.  The limits on
 * "look ahead" and "stored updates" are being implemented to assure that the
 * benchmark meets the intent to profile memory architecture and not induce
 * significant artificial data locality. For the purpose of measuring GUPS,
 * we will stipulate that each thread is permitted to look ahead no more than
 * 1024 random address stream samples with the same number of update messages
 * stored before processing.
 *
 * The supplied MPI-1 code generates the input stream {A} on all processors
 * and the global table has been distributed as uniformly as possible to
 * balance the workload and minimize any Amdahl fraction.  This code does not
 * exploit "look-ahead".  Addresses are sent to the appropriate processor
 * where the table entry resides as soon as each address is calculated.
 * Updates are performed as addresses are received.  Each message is limited
 * to a single 64 bit long integer containing element ai from {A}.
 * Local offsets for T[ ] are extracted by the destination processor.
 *
 * If the number of processors is equal to a power of two, then the global
 * table can be distributed equally over the processors.  In addition, the
 * processor number can be determined from that portion of the input stream
 * that identifies the address into the global table by masking off log2(p)
 * bits in the address.
 *
 * If the number of processors is not equal to a power of two, then the global
 * table cannot be equally distributed between processors.  In the MPI-1
 * implementation provided, there has been an attempt to minimize the differences
 * in workloads and the largest difference in elements of T[ ] is one.  The
 * number of values in the input stream generated by each processor will be
 * related to the number of global table entries on each processor.
 *
 * The MPI-1 version of RandomAccess treats the potential instance where the
 * number of processors is a power of two as a special case, because of the
 * significant simplifications possible because processor location and local
 * offset can be determined by applying masks to the input stream values.
 * The non power of two case uses an integer division to determine the processor
 * location.  The integer division will be more costly in terms of machine
 * cycles to perform than the bit masking operations
 *
 * For additional information on the GUPS metric, the HPCchallenge RandomAccess
 * Benchmark,and the rules to run RandomAccess or modify it to optimize
 * performance -- see http://icl.cs.utk.edu/hpcc/
 *
 */

#include <assert.h>
#include <getopt.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define DEFAULT_LOGN 29
#define ACCESS_PER_ELEM_GMEM_U_UNL_RW 8
#define ACCESS_PER_ELEM_GMEM_R_W 32
#define ACCESS_PER_ELEM_SHMEM 65536
#define NUM_THREADS_PER_BLOCK_GMEM 1024
// Number of threads for shared memory GUPS may not be changed for best perf
#define NUM_THREADS_PER_BLOCK_SHMEM 1024
#define POLY 0x0000000000000007ULL

// Switch to compile-time shared memory allocation otherwise collected numbers
// from shmem are instruction bound (not real gups)
#ifdef STATIC_SHMEM

// Need to manually set correct value for compile time shmem allocation
// Handled through makefile (currently only SM80 and SM90 are supported)
// Must satisfy NSHMEM = prop.sharedMemPerBlockOptin / sizeof(benchtype);
#if __CUDA_ARCH__ == 800
#define NSHMEM 20864
#elif __CUDA_ARCH__ == 900
#define NSHMEM 29056
#else
// not supported, fall back to dynamic shmem allocation
#undef STATIC_SHMEM
#endif

#endif

union benchtype
{
  uint64_t data;
};
static __constant__ uint64_t c_m2[64];

#ifndef CUDA_RT_CALL
#define CUDA_RT_CALL(call)                                                         \
  {                                                                                \
    cudaError_t cudaStatus = call;                                                 \
    if (cudaSuccess != cudaStatus)                                                 \
      fprintf(                                                                     \
          stderr,                                                                  \
          "ERROR: CUDA RT call \"%s\" in line %d of file %s failed with %s "       \
          "(%d).\n",                                                               \
          #call,                                                                   \
          __LINE__,                                                                \
          __FILE__,                                                                \
          cudaGetErrorString(cudaStatus),                                          \
          cudaStatus);                                                             \
  }
#endif

static __global__ void d_init(size_t n, benchtype* t)
{
  for (ptrdiff_t i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
       i += gridDim.x * blockDim.x) {
    t[i].data = i;
  }
}

static __device__ uint64_t d_starts(size_t n)
{
  if (n == 0) {
    return 1;
  }

  int i = 63 - __clzll(n);

  uint64_t ran = 2;
  while (i > 0) {
    uint64_t temp = 0;
    for (int j = 0; j < 64; j++) {
      if ((ran >> j) & 1) {
        temp ^= c_m2[j];
      }
    }
    ran = temp;
    i -= 1;
    if ((n >> i) & 1) {
      ran = (ran << 1) ^ ((int64_t)ran < 0 ? POLY : 0);
    }
  }

  return ran;
}

enum test_t
{
  UPDATE = 0,
  READ,
  WRITE,
  READ_WRITE,
  UPDATE_NO_LOOP,
};

const char* test_name[] = {
    "Gupdates/s ATOM.CAS.64(loop)",
    "Greads/s LD.64",
    "Gwrites/s STG.64",
    "Greads+writes/s LDG.64+STG.64",
    "Gupdates/s ATOM.CAS.64(no_loop)",
};

template <test_t T_TYPE>
__global__ void
d_bench(size_t n, size_t working_set, benchtype* t, int accesses_per_elem)
{
  size_t num_threads = gridDim.x * blockDim.x;
  size_t thread_num = blockIdx.x * blockDim.x + threadIdx.x;
  size_t start = thread_num * accesses_per_elem * n / num_threads;
  size_t end = (thread_num + 1) * accesses_per_elem * n / num_threads;
  benchtype ran;
  ran.data = d_starts(start);

#pragma unroll
  for (ptrdiff_t i = start; i < end; ++i) {
    ran.data = (ran.data << 1) ^ ((int64_t)ran.data < 0 ? POLY : 0);
    unsigned long long int *address, old, assumed;
    address = (unsigned long long int*)&t[ran.data & (working_set - 1)].data;
    switch (T_TYPE) {
    case READ:
      old = *address;
      if (old == n) { // basically never executes
        *address = n + 1;
      }
      break;
    case WRITE:
      *address = 1;
      break;
    case READ_WRITE:
      *address += 1;
      break;
    case UPDATE_NO_LOOP:
      old = *address;
      assumed = old;
      old = atomicCAS(address, assumed, assumed ^ ran.data);
      break;
    case UPDATE:
      old = *address;
      do {
        assumed = old;
        old = atomicCAS(address, assumed, assumed ^ ran.data);
      } while (assumed != old);
      break;
    }
  }
}

template <test_t T_TYPE>
__global__ void d_bench_shmem(size_t n_shmem, int accesses_per_elem_sh)
{
  extern __shared__ benchtype extern_mem[];
  benchtype* t = (benchtype*)&extern_mem;

  size_t num_threads = gridDim.x * blockDim.x;
  size_t thread_num = blockIdx.x * blockDim.x + threadIdx.x;
#ifdef STATIC_SHMEM
  // ignore any compiler warnings about not using n_shmem!
  const size_t n_shmem_l = NSHMEM;
#else
  const size_t n_shmem_l = n_shmem;
#endif
  size_t start = thread_num * accesses_per_elem_sh * n_shmem_l / num_threads;
  size_t end = (thread_num + 1) * accesses_per_elem_sh * n_shmem_l / num_threads;
  benchtype ran;
  ran.data = d_starts(start);

#pragma unroll
  for (ptrdiff_t i = start; i < end; ++i) {
    ran.data = (ran.data << 1) ^ ((int64_t)ran.data < 0 ? POLY : 0);
    unsigned long long int *address, old, assumed;
    address = (unsigned long long int*)&t[ran.data % n_shmem_l].data;
    switch (T_TYPE) {
    case READ:
      old = *address;
      if (old == n_shmem_l) { // basically never executes
        *address = n_shmem_l + 1;
      }
      break;
    case WRITE:
      *address = 1;
      break;
    case READ_WRITE:
      *address += 1;
      break;
    case UPDATE_NO_LOOP:
      old = *address;
      assumed = old;
      old = atomicCAS(address, assumed, assumed ^ ran.data);
      break;
    case UPDATE:
      old = *address;
      do {
        assumed = old;
        old = atomicCAS(address, assumed, assumed ^ ran.data);
      } while (assumed != old);
      break;
    }
  }
}

static __global__ void d_check(size_t n, benchtype* t, uint32_t* d_error)
{
  for (ptrdiff_t i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
       i += gridDim.x * blockDim.x) {
    if (t[i].data != i) {
      atomicAdd(d_error, 1);
    }
  }
}

static void starts()
{
  uint64_t m2[64];
  uint64_t temp = 1;
  for (ptrdiff_t i = 0; i < 64; i++) {
    m2[i] = temp;
    temp = (temp << 1) ^ ((int64_t)temp < 0 ? POLY : 0);
    temp = (temp << 1) ^ ((int64_t)temp < 0 ? POLY : 0);
  }
  CUDA_RT_CALL(cudaMemcpyToSymbol(c_m2, m2, sizeof(m2)));
}

int main(int argc, char* argv[])
{
  int logn = DEFAULT_LOGN;
  test_t test_type = UPDATE;
  int dev = 0;
  bool shared_mem = false;
  int logn_shmem = -1;

  int occupancy = 100;
  int repeats = 1;
  int accesses_per_elem = -1;
  int accesses_per_elem_sh = -1;

  const char* opts_desc[7] = {
      "  -n <int> input data size = 2^n [default: 29]",
      "  -o <int> occupancy percentage, 100/occupancy how much larger the "
      "working set is compared to the requested bytes [default: 100]",
      "  -r <int> number of kernel repetitions [default: 1]",
      "  -a <int> number of random accesses per input element [default: "
      " 32 (r, w) or 8 (u, unl, rw) for gmem, 65536 for shmem]",
      "  -t <int> test type (0 - update (u), 1 - read (r), 2 - write (w), 3 - read "
      "write (rw), "
      "4 - update no loop (unl)) [default: 0]",
      "  -d <int> device ID to use [default: 0]",
      "  -s <int> enable input in shared memory instead of global memory for "
      "shared memory GUPS benchmark if s>=0. The benchmark will use max available "
      "shared memory if s=0 (for ideal GUPS conditions this must be done at "
      "compile time, check README.md for build options). This tool does allow "
      "setting the shmem data size with = 2^s (for s>0), however this will "
      "also result in an instruction bound kernel that fails to reach "
      "hardware limitations of GUPS. [default: -1 (disabled)]",
  };

  int c;
  while ((c = getopt(argc, argv, "n:o:r:a:t:d:s:h")) != -1) {
    switch (c) {
    case 'n':
      logn = atoi(optarg);
      break;
    case 'o':
      occupancy = atoi(optarg);
      break;
    case 'r':
      repeats = atoi(optarg);
      break;
    case 'a':
      accesses_per_elem = accesses_per_elem_sh = atoi(optarg);
      break;
    case 't':
      test_type = static_cast<test_t>(atoi(optarg));
      break;
    case 'd':
      dev = atoi(optarg);
      break;
    case 's':
      shared_mem = true;
      logn_shmem = atoi(optarg);
      break;
    case '?':
      printf("Please use -h to get option list.\n");
      return 1;
    case 'h':
      printf("Usage:\n");
      for (int i = 0; i < 7; i++) {
        printf("%s\n", opts_desc[i]);
      }
      return 0;
    default:
      break;
    }
  }

  size_t n_shmem = 1UL << logn_shmem;

  int ndev;
  CUDA_RT_CALL(cudaGetDeviceCount(&ndev));
  if (dev < 0 || dev >= ndev) {
    dev = 0;
  }

  cudaDeviceProp prop;
  CUDA_RT_CALL(cudaGetDeviceProperties(&prop, dev));
  CUDA_RT_CALL(cudaSetDevice(dev));
  printf("Using GPU %d of %d GPUs.\n", dev, ndev);
  printf("Warp size = %d.\n", prop.warpSize);
  printf("Multi-processor count = %d.\n", prop.multiProcessorCount);
  printf(
      "Max threads per multi-processor = %d.\n", prop.maxThreadsPerMultiProcessor);

  if (shared_mem) {
    if (n_shmem == 1) {
      printf("Using max shared memory\n");
#ifdef STATIC_SHMEM
      n_shmem = NSHMEM;
#else
      n_shmem = prop.sharedMemPerBlockOptin / sizeof(benchtype);
#endif
    } else {
      printf(
          "Shared memory size = %zu (%zu bytes.)\n",
          n_shmem,
          n_shmem * sizeof(benchtype));
    }
    // using the dynamic allocation doesn't appear to impact performance so will
    // always use max.
    printf(
        "Max shared memory per block = %zu Bytes.\n", prop.sharedMemPerBlockOptin);
    if (n_shmem * sizeof(benchtype) > prop.sharedMemPerBlockOptin) {
      fprintf(stderr, "Requested shmem size not supported!\n");
      exit(-1);
    }
#ifdef STATIC_SHMEM
    assert(prop.sharedMemPerBlockOptin / sizeof(benchtype) == NSHMEM);
#endif
    CUDA_RT_CALL(cudaFuncSetAttribute(
        d_bench_shmem<READ>,
        cudaFuncAttributeMaxDynamicSharedMemorySize,
        prop.sharedMemPerBlockOptin));
    CUDA_RT_CALL(cudaFuncSetAttribute(
        d_bench_shmem<WRITE>,
        cudaFuncAttributeMaxDynamicSharedMemorySize,
        prop.sharedMemPerBlockOptin));
    CUDA_RT_CALL(cudaFuncSetAttribute(
        d_bench_shmem<UPDATE>,
        cudaFuncAttributeMaxDynamicSharedMemorySize,
        prop.sharedMemPerBlockOptin));
    CUDA_RT_CALL(cudaFuncSetAttribute(
        d_bench_shmem<READ_WRITE>,
        cudaFuncAttributeMaxDynamicSharedMemorySize,
        prop.sharedMemPerBlockOptin));
    CUDA_RT_CALL(cudaFuncSetAttribute(
        d_bench_shmem<UPDATE_NO_LOOP>,
        cudaFuncAttributeMaxDynamicSharedMemorySize,
        prop.sharedMemPerBlockOptin));
    CUDA_RT_CALL(cudaFuncSetAttribute(
        d_bench_shmem<UPDATE>,
        cudaFuncAttributeMaxDynamicSharedMemorySize,
        prop.sharedMemPerBlockOptin));
  }

  size_t n = 1UL << logn;
  size_t working_set = n * 100 / occupancy;
  if (accesses_per_elem == -1) {
    if (test_type == UPDATE || test_type == UPDATE_NO_LOOP
        || test_type == READ_WRITE) {
      accesses_per_elem = ACCESS_PER_ELEM_GMEM_U_UNL_RW;
    } else {
      accesses_per_elem = ACCESS_PER_ELEM_GMEM_R_W;
    }
  }
  if (accesses_per_elem_sh == -1) {
    accesses_per_elem_sh = ACCESS_PER_ELEM_SHMEM;
  }
  size_t thread, grid;
  if (!shared_mem) {
    thread = NUM_THREADS_PER_BLOCK_GMEM;
    // determine numebr of blocks based on input size
    grid = ceil(1.0 * n / thread);
  } else {
    thread = NUM_THREADS_PER_BLOCK_SHMEM;
    // ensure max occupancy
    grid = prop.multiProcessorCount
           * floor(prop.sharedMemPerBlockOptin / (n_shmem * sizeof(benchtype)));
  }
  size_t total_num_thread = thread * grid;

  if (!shared_mem) {
    printf(
        "Table size = %zu (%lf GB.)\nTotal number of threads %zu\nEach thread "
        "access %d locations.\nNumber of iterations = %d\n",
        working_set,
        working_set * sizeof(benchtype) / 1e9,
        total_num_thread,
        accesses_per_elem,
        repeats);
  } else {
    size_t total_shmem = grid * n_shmem * sizeof(benchtype);
    printf(
        "Table size = %zu (%lf MB.) [shared memory: %zu bytes per block x %zu blocks]\n"
        "Total number of threads %zu\nEach thread "
        "access %d locations.\nNumber of iterations = %d\n",
        total_shmem / sizeof(benchtype),
        total_shmem / 1e6,
        n_shmem * sizeof(benchtype),
        grid,
        total_num_thread,
        accesses_per_elem_sh,
        repeats);
  }

  benchtype* d_t;
  if (!shared_mem)
    CUDA_RT_CALL(cudaMalloc((void**)&d_t, working_set * sizeof(benchtype)));

  cudaEvent_t begin, end;
  CUDA_RT_CALL(cudaEventCreate(&begin));
  CUDA_RT_CALL(cudaEventCreate(&end));

  if (!shared_mem) {
    d_init<<<grid, thread>>>(working_set, d_t);
  }
  starts();

  CUDA_RT_CALL(cudaEventRecord(begin));
  CUDA_RT_CALL(cudaEventSynchronize(begin));

  if (!shared_mem) {
    for (int i = 0; i < repeats; i++) {
      if (test_type == READ) {
        d_bench<READ><<<grid, thread>>>(n, working_set, d_t, accesses_per_elem);
      } else if (test_type == WRITE) {
        d_bench<WRITE><<<grid, thread>>>(n, working_set, d_t, accesses_per_elem);
      } else if (test_type == UPDATE) {
        d_bench<UPDATE><<<grid, thread>>>(n, working_set, d_t, accesses_per_elem);
      } else if (test_type == READ_WRITE) {
        d_bench<READ_WRITE>
            <<<grid, thread>>>(n, working_set, d_t, accesses_per_elem);
      } else if (test_type == UPDATE_NO_LOOP) {
        d_bench<UPDATE_NO_LOOP>
            <<<grid, thread>>>(n, working_set, d_t, accesses_per_elem);
      } else {
        printf("Test currently not supported.");
      }
    }
  } else {
    printf(
        "Shmem launch config: %zu %zu %zu\n",
        grid,
        thread,
        n_shmem * sizeof(benchtype));
    for (int i = 0; i < repeats; i++) {
      if (test_type == READ) {
        d_bench_shmem<READ><<<grid, thread, n_shmem * sizeof(benchtype)>>>(
            n_shmem, accesses_per_elem_sh);
      } else if (test_type == WRITE) {
        d_bench_shmem<WRITE><<<grid, thread, n_shmem * sizeof(benchtype)>>>(
            n_shmem, accesses_per_elem_sh);
      } else if (test_type == UPDATE) {
        d_bench_shmem<UPDATE><<<grid, thread, n_shmem * sizeof(benchtype)>>>(
            n_shmem, accesses_per_elem_sh);
      } else if (test_type == READ_WRITE) {
        d_bench_shmem<READ_WRITE><<<grid, thread, n_shmem * sizeof(benchtype)>>>(
            n_shmem, accesses_per_elem_sh);
      } else if (test_type == UPDATE_NO_LOOP) {
        d_bench_shmem<UPDATE_NO_LOOP>
            <<<grid, thread, n_shmem * sizeof(benchtype)>>>(
                n_shmem, accesses_per_elem_sh);
      } else {
        printf("Test currently not supported.");
      }
    }
  }

  CUDA_RT_CALL(cudaEventRecord(end));
  CUDA_RT_CALL(cudaEventSynchronize(end));

  float ms;
  CUDA_RT_CALL(cudaEventElapsedTime(&ms, begin, end));
  CUDA_RT_CALL(cudaEventDestroy(end));
  CUDA_RT_CALL(cudaEventDestroy(begin));
  double time = ms * 1.0e-3;
  printf("Elapsed time = %.6f seconds\n", time);
  if (!shared_mem) {
    double result = accesses_per_elem * n * repeats / (double)ms * 1.0e-6;
    printf("Result %s = %.6f\n", test_name[test_type], result);
  } else {
    double result = accesses_per_elem_sh * n_shmem * repeats / (double)ms * 1.0e-6;
    printf("Result %s(shmem_combined_SMs) = %.6f\n", test_name[test_type], result);
    printf(
        "Result %s(shmem_single_SM) = %.6f\n",
        test_name[test_type],
        result / prop.multiProcessorCount);
  }

  uint32_t* d_error;
  CUDA_RT_CALL(cudaMalloc((void**)&d_error, sizeof(uint32_t)));
  CUDA_RT_CALL(cudaMemset(d_error, 0, sizeof(uint32_t)));

  if (test_type == UPDATE && !shared_mem) {
    // d_check only works with UPDATE operation
    d_bench<UPDATE><<<grid, thread>>>(n, working_set, d_t, accesses_per_elem);
    d_check<<<grid, thread>>>(working_set, d_t, d_error);
    uint32_t h_error;
    CUDA_RT_CALL(
        cudaMemcpy(&h_error, d_error, sizeof(uint32_t), cudaMemcpyDeviceToHost));
    printf("Verification: Found %u errors.\n", h_error);
  }

  CUDA_RT_CALL(cudaFree(d_error));
  if (!shared_mem)
    CUDA_RT_CALL(cudaFree(d_t));
  return 0;
}

